keep = ls()

###############################################
#Census 1860 All Indiana: Clean for prediction#
###############################################

census_1860_all = fread(paste0(ipums_census_path,'/indiana_1860.txt')) %>%
  .[Gender %in% "Male" &
      Race %in% c("White", "") &
      as.numeric(ResidenceAge) > 10]

#clean names
census_1860_all[, paste("match", 
                        c("first", "middle", "last", "first_clean"), 
                        sep = "_") := clean_names(first = Given,
                                                  middle = NULL,
                                                  last = Surname
                        )]
#get birth year
census_1860_all[, birth_year := 1860 - as.numeric(ResidenceAge)]

#Get matching attributes
census_1860_all = census_1860_all[, list(mpcid, 
                                         ResidenceCounty,
                                         ResidenceCity,
                                         match_first,
                                         match_middle,
                                         match_last,
                                         match_first_clean,
                                         first_sound = metaphone(match_first),
                                         last_sound = metaphone(match_last),
                                         birth_year,
                                         birth_place = BirthPlace
                                        )]

#merge in cleaned birth places
census_bp = fread('./raw/census_1860_in_birth_places.csv')
setkey(census_1860_all, birth_place)
setkey(census_bp, birth_place)
census_1860_all = census_bp[census_1860_all]
      
#Generate prediction file for fastText
census_pred_text = census_1860_all[,
                                   paste(
                                     ifelse(!is.na(match_first_clean), paste0("FN_", match_first_clean), ""),
                                     paste0("LN_", match_last),
                                     ifelse(!is.na(match_first), paste0("FN1_", match_first), ""),
                                     paste0("LN1_", match_last),
                                     ifelse(!is.na(first_sound), paste0("FS_", first_sound), ""),
                                     ifelse(!is.na(last_sound), paste0("LS_", last_sound), ""),
                                     ifelse(is.na(birth_year), "", paste0("BY_", birth_year)),
                                     ifelse(birth_place_clean == "", "", paste0("BP_", tolower(birth_place_clean))),
                                     sep = " ") %>% str_squish()]

cat(census_pred_text, sep = '\n', file = "./cleaned/fastText/data/party_predict_1860.txt")

########################################
#1874 Directories: Clean for prediction#
########################################

directory_1874_match = fread('./cleaned/directory_to_match.csv')

#Matched to veterans
cwdb_1874_matches = fread("./cleaned/cwdb_to_1874_links.csv")
matched_images = cwdb_1874_matches[(use), image_url] %>% unique

#Clean Party Label
directory_1874_match[, party_factor := "Other"]
directory_1874_match[str_detect(party, "[Rr]ep") | 
                       str_detect(party, "Union") | 
                       party %in% c("Radical", "Freesoiler"), party_factor := "Republican"]
directory_1874_match[str_detect(party, "[Dd]em") , party_factor := "Democrat"]
directory_1874_match[str_detect(party, "Grange") , party_factor := "Granger"]
directory_1874_match[str_detect(party, "Grange") , party_factor := "Granger"]
directory_1874_match[str_detect(party, "None") |
                       str_detect(party, "[nN]eut") |
                       party %in% c("", "Non-political"), party_factor := "Non-Response" ]
directory_1874_match[party_factor %in% c("Granger", "Independent"), party_factor := "Other"]
directory_1874_match[, party_label := as.factor(party_factor) %>% as.numeric %>% `-` (1) ]

#Create training data
#name, sound, 
#birth year
#birth place
directory_train = directory_1874_match[, 
                                       list(id = image_url,
                                            match_first, match_first_clean, match_middle, match_last,
                                            first_sound, last_sound,
                                            birth_year, birth_place_clean,
                                            census_county,
                                            party_factor,
                                            party_label,
                                            set = "train"
                                       )]
#exclude from training if matched to veteran
directory_train[id %in% matched_images, set := "validate"]

#Print training text for fastText
train_text = directory_train[, 
                             paste(paste0("__label__", party_label),
                                   ifelse(!is.na(match_first_clean), paste0("FN_", match_first_clean), ""),
                                   paste0("LN_", match_last),
                                   ifelse(!is.na(match_first), paste0("FN1_", match_first), ""),
                                   paste0("LN1_", match_last),
                                   ifelse(!is.na(first_sound), paste0("FS_", first_sound), ""),
                                   ifelse(!is.na(last_sound), paste0("LS_", last_sound), ""),
                                   ifelse(is.na(birth_year), "", paste0("BY_", birth_year)),
                                   ifelse(birth_place_clean == "", "", paste0("BP_", tolower(birth_place_clean))),
                                   #paste0("CC_", str_to_lower(census_county)) ,
                                   sep = " ") %>% str_squish()]


if (run_fasttext) {
  
#randomly split into 5 folds
directory_train[, fold := sample(rep(1:5, length.out = .N), .N , replace = F)]
  

#For 5 folds, save training/test set
for (i in 1:5) {
  ftrain = paste0("./cleaned/fastText/data/party_train_f",i)
  ftest = paste0("./cleaned/fastText/data/party_test_f",i)
  cat(train_text[directory_train$fold != i & directory_train$set %in% "train"], sep = '\n', file = ftrain)
  cat(train_text[directory_train$fold == i & directory_train$set %in% "train"], sep = '\n', file = ftest)
}

#Save validate set (matched to veteran)
cat(train_text[directory_train$set %in% 'validate'], sep = '\n', file = "./cleaned/fastText/data/party_validate")


###################
#FastText Training#
###################

#edit run_fasttext.sh
#set path to fastText function
paste0("sed -i 's|/path/to/fastText/function/fasttext|", 
       fasttext_path, 
       "|' ", 
       paste(getwd(),"make/run_fasttext.sh", sep = "/")
        ) %>%
  system(.)
#set path to replication file
paste0("sed -i 's|/path/to/replication/file|", 
       replication_file_path, 
       "|' ", 
       paste(getwd(),"make/run_fasttext.sh", sep = "/")
) %>%
  system(.)
#set path to fastText function


ft_path = paste(getwd(),"make/run_fasttext.sh", sep = "/")
system(paste('bash', ft_path))

}

#################################
#FastText Prediction Performance# 
#################################

#Plotting functions:

#Binomial smoother
binomial_smooth <- function(...) {
  geom_smooth(method = "glm", method.args = list(family = "binomial"), ...)
}

#AUC curves
gg_roc = function(class0, class1, label) {
  roc = roc.curve(scores.class0 = class0,
                  scores.class1 = class1, curve = T)
  ggplot(data.frame(roc$curve),aes(x=X1,y=X2,color=X3)) +         
    geom_line(size = 4, alpha = 1) + 
    labs(x="FPR",y="Sensitivity", title=paste0("ROC Curve: ",label , "\nAUC = ",  format(roc$auc,digits=3)), colour="Threshold") + 
    scale_colour_viridis_c(option = "inferno", alpha = 1) +
    theme_bw() + 
    geom_abline(slope = 1, intercept = 0)
}

#Precision-Recall Curves
gg_pr = function(class0, class1, label) {
  pr = pr.curve(scores.class0 = class0,
                scores.class1 = class1, curve = T)
  ggplot(data.frame(pr$curve),aes(x=X1,y=X2,color=X3)) +         
    geom_line(size = 4, alpha = 1) + 
    labs(x="Recall",y="Precision", title=paste0("PR Curve: ",label , "\nAUC = ",  format(pr$auc.integral,digits=3)), colour="Threshold") + 
    scale_colour_viridis_c(option = "inferno", alpha = 1) +
    theme_bw() + 
    geom_hline(yintercept = length(class0)/length(c(class0,class1))) +
    ylim(c(0,1))
}


##################################
#Prediction Performance: Test Set#
##################################

#check performance of "out of fold" or TEST set

#Combine predictions with actual partisanship for 5 out-of-fold groups
test_probs = vector('list', length = 5)
for (i in 1:5) {
  l_file = paste0('./cleaned/fastText/data/party_test_f',i)
  p_file = paste0('./cleaned/fastText/predict/party_test_f',i,'_probs.txt')
  temp = fread(p_file)
  temp_l = fread(l_file, sep = '|', header = F)
  temp_labels = temp_l$V1 %>% 
    str_extract("(?<=[_]{2}label[_]{2})[0-3]") %>%
    as.numeric
  temp[, id := paste(i, 1:.N, sep = "_")]
  temp[, label := temp_labels]
  temp = rbindlist(list(
    dcast(temp, id + label ~ V1, value.var = 'V2', fun = sum),
    dcast(temp, id + label ~ V3, value.var = 'V4', fun = sum),
    dcast(temp, id + label ~ V5, value.var = 'V6', fun = sum),
    dcast(temp, id + label ~ V7, value.var = 'V8', fun = sum)
  ), fill = T) %>% 
    .[, paste0("__label__", 0:3) := lapply(.SD, function(x) ifelse(is.na(x), 0, x)), .SDcols = paste0("__label__", 0:3)] %>%
    .[, lapply(.SD, sum), by = list(id, label), .SDcols = paste0("__label__", 0:3)]
  test_probs[[i]] = temp 
}  

test_probs = rbindlist(test_probs)
setnames(test_probs, paste0("__label__", 0:3), paste0("prob_", 0:3))

#Make Calibration Plots
a = ggplot(test_probs, aes(x = prob_0, y = 1*(label == 0))) +
  geom_jitter(alpha = 0.01, height = 0.025) + 
  binomial_smooth(formula = y ~ splines::ns(x, 3)) + 
  theme_bw() +
  xlab("Predicted Pr(Democrat)") +
  ylab("Ground Truth: Democrat")
b = ggplot(test_probs, aes(x = prob_3, y = 1*(label == 3))) +
  geom_jitter(alpha = 0.01, height = 0.025) + 
  binomial_smooth(formula = y ~ splines::ns(x, 3)) + 
  theme_bw() +
  xlab("Predicted Pr(Republican)") +
  ylab("Ground Truth: Republican") 

p = ggarrange(a,b, 
              labels = c("A", "B"),
              ncol = 2, nrow = 1)
ggsave('../../output/appendix/Figure_B6.pdf', p, width = 8, height = 6)

#Make AUC/PR Curve plot
auc_list = list(
  gg_roc(test_probs[label == 0, prob_0],
         test_probs[label !=0, prob_0],
         "Democrat"),
  gg_pr(test_probs[label == 0, prob_0],
        test_probs[label !=0, prob_0],
        "Democrat"),
  gg_roc(test_probs[label == 3, prob_3],
         test_probs[label !=3, prob_3],
         "Republican"),
  gg_pr(test_probs[label == 3, prob_3],
        test_probs[label !=3, prob_3],
        "Republican")
)

pdf('../../output/appendix/Figure_B5.pdf', width = 8.5, height = 8.5)
grid.arrange( 
  arrangeGrob(grobs = auc_list, 
              nrow = 2, ncol = 2) 
)
dev.off()


######################################
#Prediction Performance: Validate Set#
######################################

#check performance on matched-to-soldiers or Validate set

validate_probs = vector('list', length = 5)
for (i in 1:5) {
  p_file = paste0('./cleaned/fastText/predict/party_validate_f',i,'_probs.txt')
  temp = fread(p_file)
  temp[, id := directory_train[set == "validate", id]]
  temp[, label := directory_train[set == "validate", party_label]]
  temp = rbindlist(list(
    dcast(temp, id + label ~ V1, value.var = 'V2', fun = sum),
    dcast(temp, id + label ~ V3, value.var = 'V4', fun = sum),
    dcast(temp, id + label ~ V5, value.var = 'V6', fun = sum),
    dcast(temp, id + label ~ V7, value.var = 'V8', fun = sum)
  ), fill = T) %>% 
    .[, paste0("__label__", 0:3) := lapply(.SD, function(x) ifelse(is.na(x), 0, x)), .SDcols = paste0("__label__", 0:3)] %>%
    .[, lapply(.SD, function(x) sum(x, na.rm = T)), by = list(id, label), .SDcols = paste0("__label__", 0:3)] %>%
    .[, model := i]
  validate_probs[[i]] = temp 
}  

validate_probs = rbindlist(validate_probs)
setnames(validate_probs, paste0('__label__', 0:3), paste0('prob_', 0:3))
validate_probs = validate_probs[, lapply(.SD, mean), by = list(id, label), .SDcols = paste0('prob_', 0:3)]

#Make Calibration Plots
a = ggplot(validate_probs, aes(x = prob_0, y = 1*(label == 0))) +
  geom_jitter(alpha = 0.01, height = 0.025) + 
  binomial_smooth(formula = y ~ splines::ns(x, 3)) + 
  theme_bw() +
  xlab("Predicted Pr(Democrat)") +
  ylab("Ground Truth: Democrat")
b = ggplot(validate_probs, aes(x = prob_3, y = 1*(label == 3))) +
  geom_jitter(alpha = 0.01, height = 0.025) +
  binomial_smooth(formula = y ~ splines::ns(x, 3)) + 
  theme_bw() +
  xlab("Predicted Pr(Republican)") +
  ylab("Ground Truth: Republican") 

p = ggarrange(a,b, 
          labels = c("A", "B"),
          ncol = 2, nrow = 1)
ggsave('../../output/appendix/Figure_B8.pdf', p, width = 8, height = 6)

#Make AUC/PR Curve plot

auc_list = list(
  gg_roc(validate_probs[label == 0, prob_0],
         validate_probs[label !=0, prob_0],
         "Democrat"),
  gg_pr(validate_probs[label == 0, prob_0],
        validate_probs[label !=0, prob_0],
        "Democrat"),
  gg_roc(validate_probs[label == 3, prob_3],
         validate_probs[label !=3, prob_3],
         "Republican"),
  gg_pr(validate_probs[label == 3, prob_3],
        validate_probs[label !=3, prob_3],
        "Republican")
)

pdf('../../output/appendix/Figure_B7.pdf', width = 8.5, height = 8.5)
grid.arrange( 
  arrangeGrob(grobs = auc_list, 
              nrow = 2, ncol = 2) 
)
dev.off()


########################################
#Prediction Performance: Prediction Set#
########################################

#fastText performance in Indiana Counties, Townships

predict_probs = vector('list', length = 5)
for (i in 1:5) {
  p_file = paste0('./cleaned/fastText/predict/party_predict_f',i,'_probs.txt')
  temp = fread(p_file)
  temp[, id := census_1860_all[, mpcid]]
  temp = rbindlist(list(
    dcast(temp, id ~ V1, value.var = 'V2', fun = sum),
    dcast(temp, id ~ V3, value.var = 'V4', fun = sum),
    dcast(temp, id ~ V5, value.var = 'V6', fun = sum),
    dcast(temp, id ~ V7, value.var = 'V8', fun = sum)
  ), fill = T) %>% 
    .[, lapply(.SD, function(x) sum(x, na.rm = T)), by = id, .SDcols = paste0("__label__", 0:3)]
  predict_probs[[i]] = temp 
}  
predict_probs = rbindlist(predict_probs)
predict_probs = predict_probs[, lapply(.SD, mean), by = list(id), .SDcols = paste0('__label__', 0:3)]
setnames(predict_probs, paste0('__label__', 0:3), paste0('prob_', 0:3))

#Create County level averages
c_counties = census_1860_all[, list(mpcid, ResidenceCounty, ResidenceCity)]
setkey(c_counties, mpcid)
setkey(predict_probs, id)

predict_probs = c_counties[predict_probs]

#Calculate predicted democrats/republicans
over_21 = census_1860_all[(1860 - birth_year) >= 21, mpcid ]
county_preds = predict_probs[mpcid %in% over_21, list(dem_pred_n = sum(prob_0),
                                                      rep_pred_n = sum(prob_3),
                                                      other_pred_n = sum(prob_2),
                                                      none_pred_n = sum(prob_1),
                                                      n = .N), by = ResidenceCounty]

#Add in elections data 1860
elections = fread('../county_elections/cleaned/elections_wide.csv') %>%
  .[target_state %in% "INDIANA", 
    list(target_name, 
         p_gop_n = pre_vs_gop_1860_PRES * total_vote_1860_PRES,
         p_dem_n = (pre_vs_0100_1860_PRES + pre_vs_0604_1860_PRES + pre_vs_0037_1860_PRES) * total_vote_1860_PRES,
         c_gop_n = pre_vs_gop_1860_CONG * total_vote_1860_CONG, 
         c_dem_n = pre_vs_0100_1860_CONG * total_vote_1860_CONG)]

setkey(county_preds, ResidenceCounty)
all_indiana = cbind(county_preds, elections)

#calculate true and predicted partisanship
all_indiana[, p_dem_1860 := p_dem_n / n]
all_indiana[, p_rep_1860 := p_gop_n / n]
all_indiana[, c_dem_1860 := c_dem_n / n]
all_indiana[, c_rep_1860 := c_gop_n / n]
all_indiana[, dem_pred := dem_pred_n / n]
all_indiana[, rep_pred := rep_pred_n / n]
all_indiana[, other_pred := other_pred_n / n]
all_indiana[, none_pred := none_pred_n / n]

#Subset of training counties
use_counties = c('Bartholomew', 'Boone', 'Hamilton', 'Hendricks', 'Henry', 'Johnson', 'Montgomery', 'Morgan', 'Vermillion')
all_indiana[, use := ResidenceCounty %in% use_counties]

#Make County Prediction Plot
############################

#all indiana
a1 = ggplot(all_indiana, aes(x = dem_pred ,
                             y = (c_dem_1860 + p_dem_1860)/2,
                             size = n)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = 'lm') +
  theme_bw() +
  stat_cor() +
  geom_abline(intercept = 0, slope = 1) +
  xlab("Democrat (Predicted)") + 
  ylab("Democrat (Actual)")

a2 = ggplot(all_indiana, aes(x = rep_pred ,
                             y = (c_rep_1860 + p_rep_1860)/2,
                             size = n)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = 'lm') +
  theme_bw() +
  stat_cor() +
  geom_abline(intercept = 0, slope = 1) +
  xlab("Republican (Predicted)") + 
  ylab("Republican (Actual)")

a3 = ggplot(all_indiana, aes(x = rep_pred - dem_pred,
                             y = (c_rep_1860 + p_rep_1860)/2 - (c_dem_1860 + p_dem_1860) /2,
                             size = n)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = 'lm') +
  theme_bw() +
  stat_cor() +
  geom_abline(intercept = 0, slope = 1) +
  xlab("Republican - Democrat (Predicted)") + 
  ylab("Republican - Democrat (Actual)")

#9 directory counties
b1 = ggplot(all_indiana[(use)], aes(x = dem_pred ,
                                    y = (c_dem_1860 + p_dem_1860)/2,
                                    size = n)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = 'lm') +
  theme_bw() +
  stat_cor() +
  geom_abline(intercept = 0, slope = 1) +
  xlab("Democrat (Predicted)") + 
  ylab("Democrat (Actual)")

b2 = ggplot(all_indiana[(use)], aes(x = rep_pred ,
                                    y = (c_rep_1860 + p_rep_1860)/2,
                                    size = n)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = 'lm') +
  theme_bw() +
  stat_cor() +
  geom_abline(intercept = 0, slope = 1) +
  xlab("Republican (Predicted)") + 
  ylab("Republican (Actual)")

b3 = ggplot(all_indiana[(use)], aes(x = rep_pred - dem_pred,
                                    y = (c_rep_1860 + p_rep_1860)/2 - (c_dem_1860 + p_dem_1860) /2,
                                    size = n)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = 'lm') +
  theme_bw() +
  stat_cor() +
  geom_abline(intercept = 0, slope = 1) +
  xlab("Republican - Democrat (Predicted)") + 
  ylab("Republican - Democrat (Actual)") 

p = ggarrange(a1,a2,a3,b1,b2,b3, 
          labels = outer(letters[1:2], 1:3, paste0) %>% as.vector() %>% sort,
          ncol = 3, nrow = 2, common.legend = T, legend = 'none')
ggsave('../../output/appendix/Figure_B9.pdf', p, width = 8.5, height = 7)



#Make Township Prediction Plot
##############################

twp_xwalk = fread("./raw/in_twp_xwalk.csv")
twp_elections = fread("./raw/gt_in_townships.csv")

setkey(twp_xwalk, ResidenceCounty, ResidenceCity)
setkey(predict_probs, ResidenceCounty, ResidenceCity)

#True and predicted 1860 VS
twp_predict = predict_probs[twp_xwalk] %>%
  .[mpcid %in% over_21, list(
    dem_pred_n = sum(prob_0),
    rep_pred_n = sum(prob_3),
    other_pred_n = sum(prob_2),
    none_pred_n = sum(prob_1),
    n = .N
  ), by = list(ResidenceCounty, Township)]

setkey(twp_elections, county, twp)
twp_predict = twp_elections[, list(p_dem_n = douglas + breckinridge + bell, 
                                   p_dem_alt_n = douglas,
                                   p_rep_n = lincoln,
                                   p_diff = diff
)] %>%
  cbind(twp_predict, .)

twp_predict[is.na(p_diff), p_diff := p_rep_n - p_dem_alt_n]

twp_predict[, p_dem_1860 := p_dem_n / (p_dem_n + p_rep_n)]
twp_predict[, p_dem_alt_1860 := p_dem_alt_n / n]
twp_predict[, p_rep_1860 := p_rep_n / (p_dem_n + p_rep_n)]
twp_predict[, p_diff_1860 := p_diff / n]
twp_predict[, dem_pred := dem_pred_n / n]
twp_predict[, rep_pred := rep_pred_n / n]

#Make Township Prediction plots
a = ggplot(twp_predict[(p_rep_1860 < 1)], aes(x = rep_pred ,
                                              y = p_rep_1860,
                                              size = n)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = 'lm') +
  theme_bw() +
  stat_cor() +
  geom_abline(intercept = 0, slope = 1) +
  xlab("Republican (Predicted)") + 
  ylab("Republican (Actual)")
b = ggplot(twp_predict[(p_rep_1860 < 1)], aes(x = dem_pred ,
                                              y = p_dem_1860,
                                              size = n)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = 'lm') +
  theme_bw() +
  stat_cor() +
  geom_abline(intercept = 0, slope = 1) +
  xlab("Democrat (Predicted)") + 
  ylab("Democrat (Actual)")

c =  ggplot(twp_predict[(p_diff_1860 < 1)], aes(x = rep_pred - dem_pred ,
                                                y = p_diff_1860,
                                                size = n)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = 'lm') +
  theme_bw() +
  stat_cor() +
  geom_abline(intercept = 0, slope = 1) +
  xlab("Republican - Democrat (Predicted)") + 
  ylab("Republican - Democrat (Actual)")


p = ggarrange(a,b,c, 
          labels = letters[1:3],
          ncol = 3, nrow = 1, common.legend = T, legend = 'none')
ggsave('../../output/appendix/Figure_B10.pdf', p, width = 8.5, height = 5)


#############################################################
#Generate Predicted Partisanship for Census-Matched Soldiers#
#############################################################

cwdb_census_xwalk = fread('./cleaned/cwdb_to_census_links.csv')
#Save predicted partisanship for census lines matched to soldiers
predict_probs[mpcid %in% cwdb_census_xwalk$mpcid] %>% 
  fwrite('./cleaned/census_predicted_partisanship.csv')

#Cleanup
rm(list = setdiff(ls(), c(keep, 'keep')))
gc()